Descriptive Statistics

## # A tibble: 160,318,860 × 7
##    FIPSid          year FIPS_Code_State fips  Enrollment item             amount
##    <chr>          <int> <chr>           <chr>      <int> <chr>             <int>
##  1 00015003250371  2021 01              01003       2221 SchLevCode            3
##  2 00015003250371  2021 01              01003       2221 Total_Revenue     39506
##  3 00015003250371  2021 01              01003       2221 Total_Rev_Own_S…   7576
##  4 00015003250371  2021 01              01003       2221 General_Revenue   39506
##  5 00015003250371  2021 01              01003       2221 Gen_Rev_Own_Sou…   7576
##  6 00015003250371  2021 01              01003       2221 Total_Taxes        5610
##  7 00015003250371  2021 01              01003       2221 Property_Tax       5609
##  8 00015003250371  2021 01              01003       2221 Tot_Sales___Gr_…      0
##  9 00015003250371  2021 01              01003       2221 Total_Gen_Sales…      0
## 10 00015003250371  2021 01              01003       2221 Total_Select_Sa…      0
## # ℹ 160,318,850 more rows
## # A tibble: 616,015 × 7
##    FIPSid         year FIPS_Code_State fips  SchLevCode Enrollment Total_Revenue
##    <chr>         <int> <chr>           <chr>      <int>      <int>         <int>
##  1 000150011778…  1970 01              01001         NA       6800          3226
##  2 000150011778…  1971 01              01001         NA       6800          3548
##  3 000150011778…  1972 01              01001         NA       7163          3579
##  4 000150011778…  1973 01              01001         NA       6800          3758
##  5 000150011778…  1974 01              01001         NA       7163          3642
##  6 000150011778…  1975 01              01001         NA       7163          5312
##  7 000150011778…  1976 01              01001         NA       7163          6111
##  8 000150011778…  1977 01              01001         NA       7179          6382
##  9 000150011778…  1978 01              01001         NA       7163          7265
## 10 000150011778…  1979 01              01001         NA       7271          8648
## # ℹ 616,005 more rows

Complete cases

~ 10% missing values for the Total revenue category

## [1] 0.8947129
## # A tibble: 58 × 265
##    FIPSid         year FIPS_Code_State fips  SchLevCode Enrollment Total_Revenue
##    <chr>         <int> <chr>           <chr>      <int>      <int>         <int>
##  1 21             2002 04              04015          2       2234             0
##  2 252            1997 06              06023          1       2937             0
##  3 000650371845…  1997 06              06037          4       2445             0
##  4 000650591236…  1997 06              06059          4      20014             0
##  5 000650591236…  1999 06              06059          4       9549             0
##  6 000650612014…  1997 06              06061          4       6909             0
##  7 000650612014…  2002 06              06061          4       6909             0
##  8 364            2002 06              06085          1       1456             0
##  9 000650851236…  1997 06              06085          4       9728             0
## 10 367            1997 06              06087          2       5682             0
## # ℹ 48 more rows
## # ℹ 258 more variables: Total_Rev_Own_Sources <int>, General_Revenue <int>,
## #   Gen_Rev_Own_Sources <int>, Total_Taxes <int>, Property_Tax <int>,
## #   Tot_Sales___Gr_Rec_Tax <int>, Total_Gen_Sales_Tax <int>,
## #   Total_Select_Sales_Tax <int>, Public_Utility_Tax <int>, Tobacco_Tax <int>,
## #   Other_Select_Sales_Tax <int>, Total_Income_Taxes <int>,
## #   Individual_Income_Tax <int>, Taxes_NEC <int>, Total_IG_Revenue <int>, …

Reporting

temp_cleaned %>% 
  filter(Total_Revenue != 0) %>% 
  group_by(FIPSid) %>% 
  summarise(years = n()) %>% 
  ggplot() +
  geom_histogram(aes(x = years), binwidth = 3) + 
  ggtitle("Histogram of reported years over 31-year period between 1970-2021")

temp_cleaned %>% 
  filter(Total_Revenue != 0) %>% 
  group_by(year) %>% 
  summarise(reporting = n()) %>% 
  ggplot() +
  geom_bar(aes(x = year, weight = reporting)) + 
  ggtitle("Reporting over 31-year period between 1970-2021")

How many school districts per state

temp_short %>% 
  group_by(FIPS_Code_State) %>% 
  summarise(n_dist = n_distinct(FIPSid)) %>% 
  ggplot() +
  geom_col(aes(x = fct_rev(fct_reorder(as.factor(FIPS_Code_State), n_dist)), y = n_dist)) +
  labs(title = "Number of schools per state")

temp_short %>% 
  group_by(FIPS_Code_State) %>% 
  summarise(n_dist = n_distinct(FIPSid)) %>% 
  ggplot() +
  geom_histogram(aes(n_dist), bins = 10) + 
  labs(title = "Histogram of number of school districts per state")

Completeness

for(k in unique(temp_short$FIPS_Code_State)){
  print(k)
  print(get_state(k))
  temp_short %>% 
  filter(FIPS_Code_State == k) %>% 
  panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing")
}
## [1] "01"
## [1] "AL"

## [1] "04"
## [1] "AZ"
## [1] "05"
## [1] "AR"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "06"
## [1] "CA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "08"
## [1] "CO"

## [1] "09"
## [1] "CT"

## [1] "12"
## [1] "FL"
## [1] "17"
## [1] "IL"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "18"
## [1] "IN"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "20"
## [1] "KS"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "23"
## [1] "ME"

## [1] "25"
## [1] "MA"
## [1] "26"
## [1] "MI"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "27"
## [1] "MN"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "28"
## [1] "MS"
## [1] "29"
## [1] "MO"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "30"
## [1] "MT"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "31"
## [1] "NE"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "34"
## [1] "NJ"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "36"
## [1] "NY"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "38"
## [1] "ND"
## [1] "39"
## [1] "OH"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "40"
## [1] "OK"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "42"
## [1] "PA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "45"
## [1] "SC"

## [1] "46"
## [1] "SD"
## [1] "48"
## [1] "TX"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "50"
## [1] "VT"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "54"
## [1] "WV"
## [1] "55"
## [1] "WI"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "10"
## [1] "DE"

## [1] "13"
## [1] "GA"

## [1] "16"
## [1] "ID"
## [1] "19"
## [1] "IA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "21"
## [1] "KY"

## [1] "22"
## [1] "LA"

## [1] "32"
## [1] "NV"

## [1] "33"
## [1] "NH"

## [1] "35"
## [1] "NM"

## [1] "41"
## [1] "OR"

## [1] "44"
## [1] "RI"

## [1] "47"
## [1] "TN"

## [1] "49"
## [1] "UT"
## [1] "51"
## [1] "VA"
## Time is not evenly distributed (possibly due to missing data).

## [1] "53"
## [1] "WA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "56"
## [1] "WY"

State by state cleaning…

av_states <- unique(temp_short$FIPS_Code_State)

sum_state <- function(df, loc){
  print(get_state(av_states[loc]))
  
  test <- temp_short %>% 
  filter(FIPS_Code_State == av_states[loc]) %>% 
  group_by(FIPSid) %>% 
  summarise(n = n()) %>% 
  filter(n != 25) %>% 
  nrow(.)
  
  if(test == 0){
    return(paste0(get_state(av_states[loc]), " is complete!"))
    }

  tmp <- df %>%  
    filter(FIPS_Code_State == av_states[loc])
  
  pt_all <- tmp %>% panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing", main = get_state(av_states[loc]))
  pt_missing <- tmp %>% 
    group_by(FIPSid) %>% 
    filter(n() != 25) %>% 
    panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing", main = get_state(av_states[loc]))
  
  grid.arrange(pt_all, pt_missing)
  
  print(paste0("Unique school district codes: ", n_distinct(tmp$FIPSid)))
  print(paste0("Incomplete school districts: ", test))
  
  return()
}

print_names <- function(state_df){
   test <- temp %>% 
    group_by(FIPSid) %>% 
    filter(n() != 25) %>% pull(FIPSid) %>% unique
   
  sch_names %>% 
  filter(FIPSid %in% test) %>% 
  select(FIPSid, fips, Name, year) %>% 
  arrange(year) %>%  
  pivot_wider(id_cols = c(FIPSid, fips), names_from = "year", values_from = "Name") %>%
  arrange(fips) %>% print(n = 300)
}

Cleaning excel

Creates excel in which notes about choices on how to handle missing data Only Delaware, Nevada, and Tennessee have complete panels for all school districts 1997-2021

missing_dat <- tibble("state_code" = character(),
                      "state" = character(), 
                      "n_incomplete" = numeric(), 
                      "n_total" = numeric(), 
                      "FIPSid" = character(),
                      "y_missing" = numeric(), 
                      "y_earliest" = numeric(), 
                      "y_latest" = numeric(), 
                      "notes" = character(),
                      "action" = character())

for(k in av_states){
  tmp <- temp_short %>% 
    filter(FIPS_Code_State == k)
    
  missing_dat <- tmp %>% 
    group_by(FIPSid) %>% 
    filter(n() != 25) %>% 
    summarise(y_missing = 25 - n(),
              y_earliest = min(year),
              y_latest = max(year), 
              notes = NA, 
              action = NA) %>% 
    mutate(state_code = k, 
         state = get_state(k),
         n_incomplete = n(),
         n_total = n_distinct(tmp$FIPSid), .before = 1) %>% 
    rbind(missing_dat, .)
}
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#missing_dat %>% write.xlsx(here('documentation/missing_data_cleaning.xlsx'))

Standardise School district names

Duplicates remain….down from 19,026 duplicate entries to 4,955

sch_dist_names <- readRDS(here("data/temp/names_school_level_info.RDS")) %>% 
  filter(year >=1997) %>% 
  mutate(Name = case_when(FIPSid =="00185067116455" ~ "KOKOMO CENTER TOWNSHIP CONSOLIDATED SCHOOL CORPORATION", 
                          TRUE ~ Name))

temp_short %>% pull(FIPSid) %>% n_distinct
## [1] 15116
sch_names <- sch_dist_names %>% 
  # First handle abbreviations
  mutate(Name = 
                     gsub("DIST$", "DISTRICT", 
                     gsub("DIST ", "DISTRICT ",
                     gsub("DIS$", "DISTRICT",
                     gsub(" DST", " DISTRICT",
                     gsub("^FT", "FORT", 
                     gsub(" D$", " DISTRICT",
                     gsub(" DI$", " DISTRICT",
                     gsub(" DT$", " DISTRICT",
                     gsub(" DT ", "DISTRICT",
                     gsub("DISTR |DISTR$", "DISTRICT",
                    # Handle mispellings
                    gsub("DISRICT", "DISTRICT",
                    gsub("DISTT", "DISTRICT",
                    gsub("DISTRIC$", "DISTRICT",
                    gsub(" SCOOL$", " SCHOOL",
                    gsub(" SCOOL ", " SCHOOL ",
                    gsub(" SCOOLS", " SCHOOLS",
                    gsub(" Num", " ",
                    gsub("#", "",
                    gsub("  ", " ",
                    str_trim(gsub("[.]", " ", Name), side = "right"))))))))))))))))))))) %>% 
  mutate(Name = 
           gsub(" JT ", " JOINT ",
           gsub("ELEMENTARY SCHOOL DISTRICT", "SCHOOL DISTRICT",
           gsub(" SCHOOLS$", " SCHOOL DISTRICT",
           gsub(" SCH$", " SCHOOL DISTRICT",
           gsub(" ED ", " EDUCATION ",
           # Cooperative
           gsub("COOP ", "COOPERATIVE ",
           gsub(" COOP$", " COOPERATIVE",
           gsub("CO-OP ", "COOPERATIVE ",
           gsub("CO-OP$", "COOPERATIVE",
           # Bi-county
           gsub("BI-CO ", "BI-COUNTY ",
           # Elementary, High
           gsub("ELEM ", "ELEMENTARY ",
           # Unified
           gsub("UNIF ", "UNIFIED ",
           # Consolidated Independent
           gsub(" COMM CONSOLIDATED ", " COMMUNITY CONSOLIDATED ",
           gsub(" CONS ", " CONSOLIDATED ",
           gsub(" CONSOL ", "CONSOLIDATED ",
           gsub(" CONSOLID ", "CONSOLIDATED ",
           # Community Corporations
           gsub(" CORP$", " CORPORATION",
           gsub(" CORP ", " CORPORATION ",
           gsub(" COLL ", " COLLEGE ",
           gsub("CMTY", "COMMUNITY",
           gsub("COMMNTY", "COMMUNITY",
           gsub("CMNTY", "COMMUNITY",
           gsub(" COMM ", " COMMUNITY ",
           # Boards of Education
           gsub("BOARD PUBLIC INS", "SCHOOL DISTRICT",
           gsub("SCHOOL BOARD", "SCHOOL DISTRICT",
           gsub("BOARD OF EDUC", "SCHOOL DISTRICT",
           gsub("BOARD OF EDUCATION", "SCHOOL DISTRICT",
           gsub(" BD", " BOARD", 
           # School Districts
           gsub(" SCHS", " SCHOOLS",
           gsub(" PUB SCHS", " SCHOOLS",
           gsub(" SCHOOLS DISTRICT", " SCHOOL DISTRICT",
           gsub(" SCHS DISTRICT", " SCHOOL DISTRICT",
           gsub(" S$", " SCHOOL DISTRICT",
           gsub(" CO ", " COUNTY ",
           gsub(" CTY ", " CITY ", 
           gsub(" CY ", " CITY ",
           gsub(" SCH ", " SCHOOL ", 
           gsub(" SCH SCH ", " SCH ",
           Name))))))))))))))))))))))))))))))))))))))) %>% 
  # Mispelled words
  mutate(Name = gsub("MOUNTIAN", "MOUNTAIN", 
                     gsub("SCHOO ", "SCHOOL ", 
                          gsub("LAMOTTE", "LA MOTTE", 
                               gsub("SANDSPRINGS", "SAND SPRINGS", Name))))) %>% 
  # Remove numerical values
  mutate(Name = str_trim(gsub("[[:digit:]]", "", Name), side = "right")) #%>% 
  # select(FIPSid, Name) %>% 
  # distinct %>% 
  # group_by(FIPSid) %>% 
  # filter(n() > 1) %>% 
  # arrange(FIPSid) %>% View()
for(k in 1:length(av_states)){
  sum_state(temp_short, k)
}
## [1] "AL"

## [1] "Unique school district codes: 138"
## [1] "Incomplete school districts: 11"
## [1] "AZ"

## [1] "Unique school district codes: 250"
## [1] "Incomplete school districts: 29"
## [1] "AR"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 328"
## [1] "Incomplete school districts: 95"
## [1] "CA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 1178"
## [1] "Incomplete school districts: 206"
## [1] "CO"

## [1] "Unique school district codes: 209"
## [1] "Incomplete school districts: 33"
## [1] "CT"

## [1] "Unique school district codes: 50"
## [1] "Incomplete school districts: 50"
## [1] "FL"
## Time is not evenly distributed (possibly due to missing data).

## [1] "Unique school district codes: 131"
## [1] "Incomplete school districts: 36"
## [1] "IL"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 1120"
## [1] "Incomplete school districts: 265"
## [1] "IN"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 326"
## [1] "Incomplete school districts: 40"
## [1] "KS"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 363"
## [1] "Incomplete school districts: 61"
## [1] "ME"

## [1] "Unique school district codes: 140"
## [1] "Incomplete school districts: 72"
## [1] "MA"

## [1] "Unique school district codes: 93"
## [1] "Incomplete school districts: 13"
## [1] "MI"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 641"
## [1] "Incomplete school districts: 83"
## [1] "MN"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 449"
## [1] "Incomplete school districts: 133"
## [1] "MS"

## [1] "Unique school district codes: 171"
## [1] "Incomplete school districts: 28"
## [1] "MO"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 554"
## [1] "Incomplete school districts: 26"
## [1] "MT"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 387"
## [1] "Incomplete school districts: 92"
## [1] "NE"
## If the number of units is more than 300, we set "gridOff = TRUE".

## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 713"
## [1] "Incomplete school districts: 476"
## [1] "NJ"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 561"
## [1] "Incomplete school districts: 43"
## [1] "NY"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 721"
## [1] "Incomplete school districts: 52"
## [1] "ND"

## [1] "Unique school district codes: 273"
## [1] "Incomplete school districts: 105"
## [1] "OH"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 796"
## [1] "Incomplete school districts: 137"
## [1] "OK"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 593"
## [1] "Incomplete school districts: 87"
## [1] "PA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 646"
## [1] "Incomplete school districts: 135"
## [1] "SC"

## [1] "Unique school district codes: 106"
## [1] "Incomplete school districts: 30"
## [1] "SD"

## [1] "Unique school district codes: 192"
## [1] "Incomplete school districts: 45"
## [1] "TX"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 1114"
## [1] "Incomplete school districts: 52"
## [1] "VT"
## If the number of units is more than 300, we set "gridOff = TRUE".

## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 393"
## [1] "Incomplete school districts: 329"
## [1] "WV"

## [1] "Unique school district codes: 74"
## [1] "Incomplete school districts: 19"
## [1] "WI"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 444"
## [1] "Incomplete school districts: 9"
## [1] "DE"
## [1] "GA"

## [1] "Unique school district codes: 196"
## [1] "Incomplete school districts: 17"
## [1] "ID"

## [1] "Unique school district codes: 120"
## [1] "Incomplete school districts: 8"
## [1] "IA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 406"
## [1] "Incomplete school districts: 67"
## [1] "KY"

## [1] "Unique school district codes: 176"
## [1] "Incomplete school districts: 5"
## [1] "LA"

## [1] "Unique school district codes: 69"
## [1] "Incomplete school districts: 3"
## [1] "NV"
## [1] "NH"

## [1] "Unique school district codes: 171"
## [1] "Incomplete school districts: 11"
## [1] "NM"
## Time is not evenly distributed (possibly due to missing data).

## [1] "Unique school district codes: 96"
## [1] "Incomplete school districts: 1"
## [1] "OR"

## [1] "Unique school district codes: 266"
## [1] "Incomplete school districts: 43"
## [1] "RI"

## [1] "Unique school district codes: 8"
## [1] "Incomplete school districts: 4"
## [1] "TN"
## [1] "UT"

## [1] "Unique school district codes: 41"
## [1] "Incomplete school districts: 1"
## [1] "VA"
## Time is not evenly distributed (possibly due to missing data).

## Time is not evenly distributed (possibly due to missing data).

## [1] "Unique school district codes: 1"
## [1] "Incomplete school districts: 1"
## [1] "WA"
## If the number of units is more than 300, we set "gridOff = TRUE".

## [1] "Unique school district codes: 305"
## [1] "Incomplete school districts: 10"
## [1] "WY"
## Time is not evenly distributed (possibly due to missing data).

## [1] "Unique school district codes: 57"
## [1] "Incomplete school districts: 2"

Alabama

temp <- sum_state(temp_short, 1)
## [1] "AL"

## [1] "Unique school district codes: 138"
## [1] "Incomplete school districts: 11"

Arizona

temp <- sum_state(temp_short, 2)
## [1] "AZ"

## [1] "Unique school district codes: 250"
## [1] "Incomplete school districts: 29"
#sch_names %>% 
#  filter(FIPSid %in% test) %>% 
#  select(FIPSid, fips, Name, year) %>% 
#  arrange(year) %>%  
#  pivot_wider(id_cols = c(FIPSid, fips), names_from = "year", values_from = "Name") %>%
#  arrange(fips) %>% print(n = 29)

Create map of school districts and expenditure